For each disease, we derive replicates of the mapping of RCTs across diseases by simulating what the mapping of RCTs within regions would have been if the misclassification of RCTs towards groups of diseases were corrected, given the sensitivities and specificities of the classifier for identifying each group of diseases.
To estimate the performance of the classifier for each group of diseases, we have at our disposal a test set of 2,763 trials manually classified according to the 27-class grouping of diseases used in this work. The test set is described in Atal et al., BMC Bioinformatics 2016.
This script calculates the sensitivity and specificity of the classifier for identifying each disease and for identifying other studies relevant to the burden of diseases, together with the numbers of successes and of trials needed to derive beta distributions.
In [1]:
# Load the test set of classified trials and show its dimensions
# (rows = trials, columns = variables).
test_set <- read.table(
  file = "/media/igna/Elements/HotelDieu/Cochrane/MetaMapBurden/Paper_classifier/NCT_data_classified_to28cats.txt"
)
dim(test_set)
In [2]:
# We suppress injuries (category "28") from the categories attached to each
# trial, so that only categories concerning the burden of diseases remain.
# The surviving codes are re-joined with a single "&".
# NOTE(review): GBDnp is split on "&&" whereas GBD28 is split on "&" --
# confirm that this matches the separators actually used in the input file.
drop_injuries <- function(codes, sep) {
  pieces <- strsplit(as.character(codes), sep)
  sapply(pieces, function(x) {
    paste(x[x != "28"], collapse = "&")
  })
}

test_set$GBDnp <- drop_injuries(test_set$GBDnp, "&&")
test_set$GBD28 <- drop_injuries(test_set$GBD28, "&")
In [3]:
# Turn each "&"-separated string of category codes into a numeric vector,
# giving one list element per trial. A trial with no category left yields an
# empty vector.
tst <- lapply(strsplit(test_set$GBDnp, "&"), as.numeric)
alg <- lapply(strsplit(test_set$GBD28, "&"), as.numeric)
In [4]:
# Load the evaluation helpers from the working directory; conf_matrix(), used
# in the cells below, is presumably defined in this file -- TODO confirm.
source('Evaluation_metrics.R')
In [5]:
# Codes of the 27 disease categories (redefined as character in the next cell)
dis <- seq_len(27)
# GBD taxonomy table; its cause_name column is used later to label the results
Mgbd <- read.table(
  file = "/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt"
)
In [6]:
# For each category in 1:27, compare the labels in `alg` with those in `tst`,
# both for finding the disease itself (label 1) and for finding any of the
# other 26 diseases (label 2). One row of counts is appended per category.
# No random draw is visible in this cell; the seed is kept in case
# conf_matrix() relies on it.
set.seed(7212)
dis <- as.character(1:27)

# Recode the category codes of one trial relative to a target category:
# 1 if the target is present, 2 if any of the other categories is present.
# A trial can receive both labels, or NULL when none of the categories is
# present.
recode_trial <- function(codes, target, others) {
  labels <- c()
  if (target %in% codes) {
    labels <- c(1)
  }
  if (any(others %in% codes)) {
    labels <- c(labels, 2)
  }
  labels
}

PERF_F <- data.frame()
for (i in dis) {
  others <- setdiff(dis, i)
  ALG <- lapply(alg, recode_trial, target = i, others = others)
  DT <- lapply(tst, recode_trial, target = i, others = others)
  # conf_matrix() is not defined in this notebook; its first row is used for
  # label 1 (the disease) and its second row for label 2 (other diseases).
  CM <- conf_matrix(ALG, DT, c(1, 2))
  PERF <- c(CM[1, ], CM[2, ])
  PERF_F <- rbind(PERF_F, PERF)
}
In [7]:
# We add the performance of the classifier at identifying trials relevant to
# the burden of diseases, i.e. trials with at least one category.
# NOTE(review): each trial is summarised by its number of categories rather
# than by a 0/1 flag -- confirm how conf_matrix() treats trials with more than
# one category when the class of interest is 1.
ALG <- as.list(lengths(alg))
DT <- as.list(lengths(tst))
CM <- conf_matrix(ALG, DT, 1)
# This row has no "other diseases" counterpart, so it is padded with four NAs
PERF <- c(CM, rep(NA, times = 4))
PERF_F <- rbind(PERF_F, PERF)
In [8]:
# Label the eight count columns: TP/FP/TN/FN for the disease itself ("_Dis"),
# then the same four for the other diseases ("_Oth").
# NOTE(review): the TP, FP, TN, FN order is assumed to match the column order
# returned by conf_matrix() -- confirm against its definition.
PERF_F <- data.frame(PERF_F)
names(PERF_F) <- as.vector(
  outer(c("TP", "FP", "TN", "FN"), c("_Dis", "_Oth"), paste0)
)
In [9]:
# Identify each row: the category code (0 for the final overall row) and the
# matching cause name ("All" for the final overall row). `dis` is a character
# vector at this point, so the 0 is stored as "0".
# The 28th cause name is dropped -- presumably injuries; TODO confirm.
PERF_F[["dis"]] <- c(dis, 0)
PERF_F[["GBD"]] <- c(as.character(Mgbd$cause_name[-28]), "All")
In [10]:
# Move the two identifier columns (positions 9 and 10) in front of the eight
# count columns.
PERF_F <- PERF_F[c(9:10, 1:8)]
In [11]:
# Display the final table of counts (one row per category plus the overall row)
PERF_F
In [12]:
# Save the table of counts; the path is relative to the working directory and
# write.csv() does not create the Tables/ folder if it is missing.
write.csv(
  x = PERF_F,
  file = "Tables/Performances_per_27disease_data.csv"
)
In [ ]: